from DrissionPage import ChromiumPage
import datetime
import pandas as pd
import re


# Matches code points U+0000-U+001F and U+007F-U+009F (control characters).
_CONTROL_CHARS = re.compile(r'[\x00-\x1F\x7F-\x9F]')


def remove_illegal_chars(value):
    """Return ``value`` with control characters removed.

    Only ``str`` inputs are cleaned; any other value (numbers, ``None``,
    datetimes, ...) is returned unchanged so the function can be applied
    to arbitrary cell values.
    """
    if not isinstance(value, str):
        return value
    return _CONTROL_CHARS.sub('', value)


# Video page scraped when get_data() is called without an explicit URL.
_DEFAULT_VIDEO_URL = (
    "https://www.bilibili.com/video/BV1oH4y1c7Kk/"
    "?spm_id_from=333.337.search-card.all.click"
    "&vd_source=32f5ffc41907f214a366192abe1d8da8"
)


def _extract_comments(payload):
    """Convert one reply-API response body into a list of row dicts.

    Returns an empty list when the body is not a JSON object, has no
    ``data`` object, or its ``replies`` entry is missing, ``null`` or empty.
    """
    if not isinstance(payload, dict):
        return []
    data = payload.get("data")
    if not isinstance(data, dict):
        return []

    rows = []
    for reply in data.get("replies") or []:
        # ``ctime`` is passed to fromtimestamp(), i.e. treated as a Unix
        # timestamp and converted to a naive local datetime.
        posted_at = datetime.datetime.fromtimestamp(reply["ctime"])
        rows.append({
            "昵称": reply["member"]["uname"],
            "时间": posted_at,
            # Strip control characters before the text reaches the export.
            "评论": remove_illegal_chars(reply["content"]["message"]),
        })
    return rows


def get_data(url=_DEFAULT_VIDEO_URL, pages=10, timeout=30):
    """Scrape comments from a Bilibili video page and save them via ``save``.

    The page is scrolled to the bottom up to ``pages`` times; after each
    scroll the function waits for a response whose URL matches the reply
    API and collects the comments it carries. Every collected row is also
    printed.

    Args:
        url: Video page to open. Defaults to the originally hard-coded video.
        pages: Maximum number of scroll/collect rounds.
        timeout: Seconds to wait for a reply packet after each scroll. When
            no packet arrives in time (for example because all comments have
            already been loaded), paging stops instead of blocking forever.
    """
    # Open the browser.
    driver = ChromiumPage()
    # Start listening before navigating to the page.
    driver.listen.start("x/v2/reply/wbi/main")
    result = []
    try:
        driver.get(url)
        for page in range(1, pages + 1):
            print(f"正在采集第{page}页数据内容")
            # Scrolling to the bottom triggers loading of further comments.
            driver.scroll.to_bottom()
            packet = driver.listen.wait(timeout=timeout)
            if not packet:
                # wait() yields a falsy value on timeout: nothing more to read.
                print("等待评论数据包超时，停止翻页")
                break
            rows = _extract_comments(packet.response.body)
            for row in rows:
                print(row)
            result.extend(rows)
    finally:
        # Always detach the listener, even if scraping failed midway.
        driver.listen.stop()
    save(result)


def save(result):
    """Export the collected rows to the Excel file ``comments.xlsx``.

    ``result`` is passed straight to ``pd.DataFrame``; the file is written
    to a relative path and without the DataFrame index column.
    """
    pd.DataFrame(result).to_excel("comments.xlsx", index=False)


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    get_data()
